"""
完成练习案例：单词计数统计
"""
from pyspark import SparkConf, SparkContext
import os

# Point PySpark at a specific Python interpreter. This path is
# machine-specific and must be set before the SparkContext is created.
os.environ['PYSPARK_PYTHON'] = 'D:/Work/anaconda3/python.exe'


def split_words(line):
    """Return the list of whitespace-separated words in *line*.

    ``str.split()`` with no argument is used instead of ``split(" ")`` so
    that empty lines, leading/trailing blanks and runs of spaces do not
    produce empty-string "words" that would then be counted.
    """
    return line.split()


def main():
    """Count how often each word occurs in ``hello.txt`` and print the result.

    The printed value is the list of ``(word, count)`` tuples collected
    from the RDD.
    """
    # Build the entry point of the execution environment.
    conf = SparkConf().setMaster("local[*]").setAppName("WordCount")
    sc = SparkContext(conf=conf)
    try:
        result_rdd = (
            sc.textFile("hello.txt")          # one record per line of the file
            .flatMap(split_words)             # flatten lines into single words
            .map(lambda word: (word, 1))      # (hello, 1), (spark, 1), ...
            .reduceByKey(lambda x, y: x + y)  # sum the ones per word
        )
        print(result_rdd.collect())
    finally:
        # Always stop the context, even when the job fails
        # (for example because the input file is missing).
        sc.stop()


if __name__ == "__main__":
    main()




